import docx
import re
import os

from name_sep import extract_name_from_text

def extract_docx_text(docFile) -> str:
    """Return the paragraph text of a .docx file as one newline-separated string.

    Args:
        docFile: Passed unchanged to ``docx.Document``.

    Returns:
        The ``text`` of every entry in ``document.paragraphs``, each followed
        by a ``'\\n'`` (so a non-empty result always ends with a newline).
        An empty string if the document has no paragraphs.
    """
    # NOTE(review): only `document.paragraphs` is read; text stored elsewhere
    # (e.g. inside tables) is presumably not included - confirm if that matters.
    document = docx.Document(docFile)
    # Build the result in a single pass instead of repeated `+=`, which can
    # degrade to quadratic time on long documents.
    return "".join(para.text + '\n' for para in document.paragraphs)

# Working folder and input document (hard-coded locations).
dict_path = r'C:\Users\yyds\Desktop\new_demo'
docFile = os.path.join(dict_path, "519038.docx")

# Separator line that appears in the document and is dropped during filtering.
divider2 = "--------------------------------------------------------------------------------"

# Timestamp regex, anchored at the start of a line
# (matches e.g. "2025-10-22 14:30 ...").
time_pattern = re.compile(r'^\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}')

# Load the whole document (no longer limited to lines[:1125]) and split it
# into individual lines.
text = extract_docx_text(docFile)
lines = text.split('\n')

# Discard blank lines and divider2 lines. The comparison is made on the
# stripped text, but the line itself is kept unmodified.
filtered_lines = [
    line for line in lines
    if line.strip() not in ("", divider2)
]

# Extract the name from the full document text and report it.
name = extract_name_from_text(text)
print("姓名：" + name)

# Build the output file path; the extracted name is embedded in the file name.
output_txt_file = os.path.join(dict_path, f'519038{name}.txt')

# Write only the lines that begin with a timestamp. Each one is prefixed with
# its position in filtered_lines, so the numbers are generally not consecutive.
with open(output_txt_file, 'w', encoding='utf-8') as f:
    f.writelines(
        f"{position}. {entry}\n"
        for position, entry in enumerate(filtered_lines)
        if time_pattern.match(entry)
    )

print("处理完成！")